1   package org.apache.lucene.analysis.pt;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one or more
5    * contributor license agreements.  See the NOTICE file distributed with
6    * this work for additional information regarding copyright ownership.
7    * The ASF licenses this file to You under the Apache License, Version 2.0
8    * (the "License"); you may not use this file except in compliance with
9    * the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.io.InputStreamReader;
23  import java.io.LineNumberReader;
24  import java.nio.charset.StandardCharsets;
25  import java.util.ArrayList;
26  import java.util.Arrays;
27  import java.util.HashMap;
28  import java.util.List;
29  import java.util.Map;
30  import java.util.regex.Matcher;
31  import java.util.regex.Pattern;
32  
33  import org.apache.lucene.analysis.util.CharArraySet;
34  
35  import static org.apache.lucene.analysis.util.StemmerUtil.*;
36  
37  /**
38   * Base class for stemmers that use a set of RSLP-like stemming steps.
39   * <p>
40   * RSLP (Removedor de Sufixos da Lingua Portuguesa) is an algorithm designed
41   * originally for stemming the Portuguese language, described in the paper
42   * <i>A Stemming Algorithm for the Portuguese Language</i>, Orengo et. al.
43   * <p>
44   * Since this time a plural-only modification (RSLP-S) as well as a modification
45   * for the Galician language have been implemented. This class parses a configuration
46   * file that describes {@link Step}s, where each Step contains a set of {@link Rule}s.
47   * <p>
48   * The general rule format is: 
49   * <blockquote>{ "suffix", N, "replacement", { "exception1", "exception2", ...}}</blockquote>
50   * where:
51   * <ul>
52   *   <li><code>suffix</code> is the suffix to be removed (such as "inho").
53   *   <li><code>N</code> is the min stem size, where stem is defined as the candidate stem 
54   *       after removing the suffix (but before appending the replacement!)
55   *   <li><code>replacement</code> is an optimal string to append after removing the suffix.
56   *       This can be the empty string.
57   *   <li><code>exceptions</code> is an optional list of exceptions, patterns that should 
58   *       not be stemmed. These patterns can be specified as whole word or suffix (ends-with) 
59   *       patterns, depending upon the exceptions format flag in the step header.
60   * </ul>
61   * <p>
62   * A step is an ordered list of rules, with a structure in this format:
63   * <blockquote>{ "name", N, B, { "cond1", "cond2", ... }
64   *               ... rules ... };
65   * </blockquote>
66   * where:
67   * <ul>
68   *   <li><code>name</code> is a name for the step (such as "Plural").
69   *   <li><code>N</code> is the min word size. Words that are less than this length bypass
70   *       the step completely, as an optimization. Note: N can be zero, in this case this 
71   *       implementation will automatically calculate the appropriate value from the underlying 
72   *       rules.
73   *   <li><code>B</code> is a "boolean" flag specifying how exceptions in the rules are matched.
74   *       A value of 1 indicates whole-word pattern matching, a value of 0 indicates that 
75   *       exceptions are actually suffixes and should be matched with ends-with.
76   *   <li><code>conds</code> are an optional list of conditions to enter the step at all. If
77   *       the list is non-empty, then a word must end with one of these conditions or it will
78   *       bypass the step completely as an optimization.
79   * </ul>
80   * <p>
81   * @see <a href="http://www.inf.ufrgs.br/~viviane/rslp/index.htm">RSLP description</a>
82   * @lucene.internal
83   */
84  public abstract class RSLPStemmerBase {
85    
86    /**
87     * A basic rule, with no exceptions.
88     */
89    protected static class Rule {
90      protected final char suffix[];
91      protected final char replacement[];
92      protected final int min;
93      
94      /**
95       * Create a rule.
96       * @param suffix suffix to remove
97       * @param min minimum stem length
98       * @param replacement replacement string
99       */
100     public Rule(String suffix, int min, String replacement) {
101       this.suffix = suffix.toCharArray();
102       this.replacement = replacement.toCharArray();
103       this.min = min;
104     }
105     
106     /**
107      * @return true if the word matches this rule.
108      */
109     public boolean matches(char s[], int len) {
110       return (len - suffix.length >= min && endsWith(s, len, suffix));
111     }
112     
113     /**
114      * @return new valid length of the string after firing this rule.
115      */
116     public int replace(char s[], int len) {
117       if (replacement.length > 0) {
118         System.arraycopy(replacement, 0, s, len - suffix.length, replacement.length);
119       }
120       return len - suffix.length + replacement.length;
121     }
122   }
123   
124   /**
125    * A rule with a set of whole-word exceptions.
126    */
127   protected static class RuleWithSetExceptions extends Rule {
128     protected final CharArraySet exceptions;
129     
130     public RuleWithSetExceptions(String suffix, int min, String replacement,
131         String[] exceptions) {
132       super(suffix, min, replacement);
133       for (int i = 0; i < exceptions.length; i++) {
134         if (!exceptions[i].endsWith(suffix))
135           throw new RuntimeException("useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
136       }
137       this.exceptions = new CharArraySet(Arrays.asList(exceptions), false);
138     }
139 
140     @Override
141     public boolean matches(char s[], int len) {
142       return super.matches(s, len) && !exceptions.contains(s, 0, len);
143     }
144   }
145   
146   /**
147    * A rule with a set of exceptional suffixes.
148    */
149   protected static class RuleWithSuffixExceptions extends Rule {
150     // TODO: use a more efficient datastructure: automaton?
151     protected final char[][] exceptions;
152     
153     public RuleWithSuffixExceptions(String suffix, int min, String replacement,
154         String[] exceptions) {
155       super(suffix, min, replacement);
156       for (int i = 0; i < exceptions.length; i++) {
157         if (!exceptions[i].endsWith(suffix))
158           throw new RuntimeException("warning: useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
159       }
160       this.exceptions = new char[exceptions.length][];
161       for (int i = 0; i < exceptions.length; i++)
162         this.exceptions[i] = exceptions[i].toCharArray();
163     }
164     
165     @Override
166     public boolean matches(char s[], int len) {
167       if (!super.matches(s, len))
168         return false;
169       
170       for (int i = 0; i < exceptions.length; i++)
171         if (endsWith(s, len, exceptions[i]))
172           return false;
173 
174       return true;
175     }
176   }
177   
178   /**
179    * A step containing a list of rules.
180    */
181   protected static class Step {
182     protected final String name;
183     protected final Rule rules[];
184     protected final int min;
185     protected final char[][] suffixes;
186     
187     /**
188      * Create a new step
189      * @param name Step's name.
190      * @param rules an ordered list of rules.
191      * @param min minimum word size. if this is 0 it is automatically calculated.
192      * @param suffixes optional list of conditional suffixes. may be null.
193      */
194     public Step(String name, Rule rules[], int min, String suffixes[]) {
195       this.name = name;
196       this.rules = rules;
197       if (min == 0) {
198         min = Integer.MAX_VALUE;
199         for (Rule r : rules)
200           min = Math.min(min, r.min + r.suffix.length);
201       }
202       this.min = min;
203       
204       if (suffixes == null || suffixes.length == 0) {
205         this.suffixes = null;
206       } else {
207         this.suffixes = new char[suffixes.length][];
208         for (int i = 0; i < suffixes.length; i++)
209           this.suffixes[i] = suffixes[i].toCharArray();
210       }
211     }
212     
213     /**
214      * @return new valid length of the string after applying the entire step.
215      */
216     public int apply(char s[], int len) {
217       if (len < min)
218         return len;
219       
220       if (suffixes != null) {
221         boolean found = false;
222         
223         for (int i = 0; i < suffixes.length; i++)
224           if (endsWith(s, len, suffixes[i])) {
225             found = true;
226             break;
227           }
228         
229         if (!found) return len;
230       }
231       
232       for (int i = 0; i < rules.length; i++) {
233         if (rules[i].matches(s, len))
234           return rules[i].replace(s, len);
235       }
236       
237       return len;
238     }
239   }
240   
241   /**
242    * Parse a resource file into an RSLP stemmer description.
243    * @return a Map containing the named Steps in this description.
244    */
245   protected static Map<String,Step> parse(Class<? extends RSLPStemmerBase> clazz, String resource) {
246     // TODO: this parser is ugly, but works. use a jflex grammar instead.
247     try {
248       InputStream is = clazz.getResourceAsStream(resource);
249       LineNumberReader r = new LineNumberReader(new InputStreamReader(is, StandardCharsets.UTF_8));
250       Map<String,Step> steps = new HashMap<>();
251       String step;
252       while ((step = readLine(r)) != null) {
253         Step s = parseStep(r, step);
254         steps.put(s.name, s);
255       }
256       r.close();
257       return steps;
258     } catch (IOException e) {
259       throw new RuntimeException(e);
260     }
261   }
262   
263   private static final Pattern headerPattern = 
264     Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*(0|1),\\s*\\{(.*)\\},\\s*$");
265   private static final Pattern stripPattern = 
266     Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+)\\s*\\}\\s*(,|(\\}\\s*;))$");
267   private static final Pattern repPattern = 
268     Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*\"([^\"]*)\"\\}\\s*(,|(\\}\\s*;))$");
269   private static final Pattern excPattern = 
270     Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*\"([^\"]*)\",\\s*\\{(.*)\\}\\s*\\}\\s*(,|(\\}\\s*;))$");
271   
272   private static Step parseStep(LineNumberReader r, String header) throws IOException {
273     Matcher matcher = headerPattern.matcher(header);
274     if (!matcher.find()) {
275       throw new RuntimeException("Illegal Step header specified at line " + r.getLineNumber());
276     }
277     assert matcher.groupCount() == 4;
278     String name = matcher.group(1);
279     int min = Integer.parseInt(matcher.group(2));
280     int type = Integer.parseInt(matcher.group(3));
281     String suffixes[] = parseList(matcher.group(4));
282     Rule rules[] = parseRules(r, type);
283     return new Step(name, rules, min, suffixes);
284   }
285   
286   private static Rule[] parseRules(LineNumberReader r, int type) throws IOException {
287     List<Rule> rules = new ArrayList<>();
288     String line;
289     while ((line = readLine(r)) != null) {
290       Matcher matcher = stripPattern.matcher(line);
291       if (matcher.matches()) {
292         rules.add(new Rule(matcher.group(1), Integer.parseInt(matcher.group(2)), ""));
293       } else {
294         matcher = repPattern.matcher(line);
295         if (matcher.matches()) {
296           rules.add(new Rule(matcher.group(1), Integer.parseInt(matcher.group(2)), matcher.group(3)));
297         } else {
298           matcher = excPattern.matcher(line);
299           if (matcher.matches()) {
300             if (type == 0) {
301               rules.add(new RuleWithSuffixExceptions(matcher.group(1), 
302                         Integer.parseInt(matcher.group(2)), 
303                         matcher.group(3), 
304                         parseList(matcher.group(4))));
305             } else {
306               rules.add(new RuleWithSetExceptions(matcher.group(1), 
307                         Integer.parseInt(matcher.group(2)), 
308                         matcher.group(3), 
309                         parseList(matcher.group(4))));
310             }
311           } else {
312             throw new RuntimeException("Illegal Step rule specified at line " + r.getLineNumber());
313           }
314         }
315       }
316       if (line.endsWith(";"))
317         return rules.toArray(new Rule[rules.size()]);
318     }
319     return null;
320   }
321   
322   private static String[] parseList(String s) {
323     if (s.length() == 0)
324       return null;
325     String list[] = s.split(",");
326     for (int i = 0; i < list.length; i++)
327       list[i] = parseString(list[i].trim());
328     return list;
329   }
330   
331   private static String parseString(String s) {
332     return s.substring(1, s.length()-1);
333   }
334   
335   private static String readLine(LineNumberReader r) throws IOException {
336     String line = null;
337     while ((line = r.readLine()) != null) {
338       line = line.trim();
339       if (line.length() > 0 && line.charAt(0) != '#')
340         return line;
341     }
342     return line;
343   }
344 }